{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Address Matching with k-Nearest Neighbors\n",
    "\n",
    "\n",
    "This function illustrates a way to perform address matching between two data sets.\n",
    "\n",
    "For each test address, we will return the closest reference address to it.\n",
    "\n",
    "We will consider two distance functions:\n",
    "1. Edit distance for street number/name and\n",
    "2. Euclidian distance (L2) for the zip codes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import random\n",
    "import string\n",
    "import numpy as np\n",
    "import tensorflow as tf\n",
    "from tensorflow.python.framework import ops\n",
    "ops.reset_default_graph()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### First we generate the data sets we will need"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# n = Size of created data sets\n",
    "n = 10\n",
    "street_names = ['abbey', 'baker', 'canal', 'donner', 'elm']\n",
    "street_types = ['rd', 'st', 'ln', 'pass', 'ave']\n",
    "\n",
    "random.seed(31)  #make results reproducible\n",
    "rand_zips = [random.randint(65000,65999) for i in range(5)]\n",
    "\n",
    "# Function to randomly create one typo in a string w/ a probability\n",
    "def create_typo(s, prob=0.75):\n",
    "    if random.uniform(0,1) < prob:\n",
    "        rand_ind = random.choice(range(len(s)))\n",
    "        s_list = list(s)\n",
    "        s_list[rand_ind]=random.choice(string.ascii_lowercase)\n",
    "        s = ''.join(s_list)\n",
    "    return(s)\n",
    "\n",
    "# Generate the reference dataset\n",
    "numbers = [random.randint(1, 9999) for i in range(n)]\n",
    "streets = [random.choice(street_names) for i in range(n)]\n",
    "street_suffs = [random.choice(street_types) for i in range(n)]\n",
    "zips = [random.choice(rand_zips) for i in range(n)]\n",
    "full_streets = [str(x) + ' ' + y + ' ' + z for x,y,z in zip(numbers, streets, street_suffs)]\n",
    "reference_data = [list(x) for x in zip(full_streets,zips)]\n",
    "\n",
    "# Generate test dataset with some typos\n",
    "typo_streets = [create_typo(x) for x in streets]\n",
    "typo_full_streets = [str(x) + ' ' + y + ' ' + z for x,y,z in zip(numbers, typo_streets, street_suffs)]\n",
    "test_data = [list(x) for x in zip(typo_full_streets,zips)]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Now we can perform address matching"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Create graph\n",
    "sess = tf.Session()\n",
    "\n",
    "# Placeholders\n",
    "test_address = tf.sparse_placeholder( dtype=tf.string)\n",
    "test_zip = tf.placeholder(shape=[None, 1], dtype=tf.float32)\n",
    "ref_address = tf.sparse_placeholder(dtype=tf.string)\n",
    "ref_zip = tf.placeholder(shape=[None, n], dtype=tf.float32)\n",
    "\n",
    "# Declare Zip code distance for a test zip and reference set\n",
    "zip_dist = tf.square(tf.subtract(ref_zip, test_zip))\n",
    "\n",
    "# Declare Edit distance for address\n",
    "address_dist = tf.edit_distance(test_address, ref_address, normalize=True)\n",
    "\n",
    "# Create similarity scores\n",
    "zip_max = tf.gather(tf.squeeze(zip_dist), tf.argmax(zip_dist, 1))\n",
    "zip_min = tf.gather(tf.squeeze(zip_dist), tf.argmin(zip_dist, 1))\n",
    "zip_sim = tf.div(tf.subtract(zip_max, zip_dist), tf.subtract(zip_max, zip_min))\n",
    "address_sim = tf.subtract(1., address_dist)\n",
    "\n",
    "# Combine distance functions\n",
    "address_weight = 0.5\n",
    "zip_weight = 1. - address_weight\n",
    "weighted_sim = tf.add(tf.transpose(tf.multiply(address_weight, address_sim)), tf.multiply(zip_weight, zip_sim))\n",
    "\n",
    "# Predict: Get max similarity entry\n",
    "top_match_index = tf.argmax(weighted_sim, 1)\n",
    "\n",
    "# Function to Create a character-sparse tensor from strings\n",
    "def sparse_from_word_vec(word_vec):\n",
    "    num_words = len(word_vec)\n",
    "    indices = [[xi, 0, yi] for xi,x in enumerate(word_vec) for yi,y in enumerate(x)]\n",
    "    chars = list(''.join(word_vec))\n",
    "    return(tf.SparseTensorValue(indices, chars, [num_words,1,1]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Address: 2308 bakar rd, 65480\n",
      "Match  : 2308 baker rd, 65480\n",
      "Address: 709 bakeo pass, 65480\n",
      "Match  : 709 baker pass, 65480\n",
      "Address: 2273 glm ln, 65782\n",
      "Match  : 2273 elm ln, 65782\n",
      "Address: 1843 donner st, 65402\n",
      "Match  : 1843 donner st, 65402\n",
      "Address: 8769 klm st, 65402\n",
      "Match  : 8769 elm st, 65402\n",
      "Address: 3798 dpnner ln, 65012\n",
      "Match  : 3798 donner ln, 65012\n",
      "Address: 2288 bajer pass, 65012\n",
      "Match  : 2288 baker pass, 65012\n",
      "Address: 2416 epm ln, 65480\n",
      "Match  : 2416 elm ln, 65480\n",
      "Address: 543 abgey ave, 65115\n",
      "Match  : 543 abbey ave, 65115\n",
      "Address: 994 abbey st, 65480\n",
      "Match  : 994 abbey st, 65480\n"
     ]
    }
   ],
   "source": [
    "# Loop through test indices\n",
    "reference_addresses = [x[0] for x in reference_data]\n",
    "reference_zips = np.array([[x[1] for x in reference_data]])\n",
    "\n",
    "# Create sparse address reference set\n",
    "sparse_ref_set = sparse_from_word_vec(reference_addresses)\n",
    "\n",
    "for i in range(n):\n",
    "    test_address_entry = test_data[i][0]\n",
    "    test_zip_entry = [[test_data[i][1]]]\n",
    "    \n",
    "    # Create sparse address vectors\n",
    "    test_address_repeated = [test_address_entry] * n\n",
    "    sparse_test_set = sparse_from_word_vec(test_address_repeated)\n",
    "    \n",
    "    feeddict={test_address: sparse_test_set,\n",
    "               test_zip: test_zip_entry,\n",
    "               ref_address: sparse_ref_set,\n",
    "               ref_zip: reference_zips}\n",
    "    best_match = sess.run(top_match_index, feed_dict=feeddict)\n",
    "    best_street = reference_addresses[best_match[0]]\n",
    "    [best_zip] = reference_zips[0][best_match]\n",
    "    [[test_zip_]] = test_zip_entry\n",
    "    print('Address: ' + str(test_address_entry) + ', ' + str(test_zip_))\n",
    "    print('Match  : ' + str(best_street) + ', ' + str(best_zip))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [conda env:tf-cpu]",
   "language": "python",
   "name": "conda-env-tf-cpu-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
